knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(phytools)
## Loading required package: ape
## Loading required package: maps
library(ape)
library(readxl)
library(svglite)
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
##
## ozone
library(RColorBrewer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks maps::map()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ dplyr::where() masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:plyr':
##
## mutate
##
## The following object is masked from 'package:ape':
##
## rotate
library(rstatix)
##
## Attaching package: 'rstatix'
##
## The following objects are masked from 'package:plyr':
##
## desc, mutate
##
## The following object is masked from 'package:stats':
##
## filter
library(AMR) #g.test
## Registered S3 method overwritten by 'AMR':
## method from
## plot.sir igraph
library(car)#multivatiate regression model
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(jtools)#multivatiate regression model
library(interactions)#multivatiate regression model
library(tidyverse)#logistic regression
library(caret)#logistic regression
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
Association of RT-qPCR RV Ct value with symptoms, age and sex of the individuals:
PCR_data_RV <- read_excel("PCR_Ct_symp_age.xlsx")
PCR_data_RV$CollMONTH <- as.factor(substr(PCR_data_RV$CollDate,1,7))
PCR_data_RV$sex <- as.factor(PCR_data_RV$sex)
PCR_data_RV$Symptoms <- as.factor(PCR_data_RV$Symptoms)
PCR_data_RV$AgeGroup <- as.factor(PCR_data_RV$AgeGroup)
PCR_data_RV$Age <- as.numeric(PCR_data_RV$Age)
PCR_data_RV$Ct <- as.numeric(PCR_data_RV$Ct)
#analysis Ct versus age:
PCR_data_RV %>% group_by(AgeGroup) %>% get_summary_stats(Ct, type="mean_sd")
## # A tibble: 5 × 5
## AgeGroup variable n mean sd
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 18-65 Ct 718 27.5 5.98
## 2 5-17 Ct 217 25.6 6.20
## 3 less5 Ct 209 26.9 5.37
## 4 more65 Ct 80 28.5 5.15
## 5 <NA> Ct 1 21.6 NA
#plotting Ct vs age groups
ordered_age_groups = factor(PCR_data_RV$AgeGroup, levels=c("less5", "5-17", "18-65", "more65"))
ggplot(data = PCR_data_RV, mapping = aes(x=ordered_age_groups, y=Ct, fill=ordered_age_groups)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.4, alpha=0.9)
## Warning: Removed 1 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
#Kruskal test Ct vs age:
res.age.kruskal <- kruskal_test(PCR_data_RV, Ct ~ AgeGroup)
res.age.kruskal
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 Ct 1226 23.2 3 0.000037 Kruskal-Wallis
#calculate effect size
res.age.effsize <- PCR_data_RV %>% kruskal_effsize(Ct ~ AgeGroup)
res.age.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Ct 1226 0.0165 eta2[H] small
#Wilcoxon’s test - pairwise comparisons between group:
pwc.Ct.age <- PCR_data_RV %>% wilcox_test(Ct ~ AgeGroup, p.adjust.method = "bonferroni")
pwc.Ct.age
## # A tibble: 6 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Ct 18-65 5-17 718 217 92446. 0.0000303 0.000182 ***
## 2 Ct 18-65 less5 718 209 79129 0.229 1 ns
## 3 Ct 18-65 more65 718 80 25732. 0.127 0.762 ns
## 4 Ct 5-17 less5 217 209 19422 0.01 0.062 ns
## 5 Ct 5-17 more65 217 80 6101 0.000086 0.000516 ***
## 6 Ct less5 more65 209 80 6903 0.022 0.131 ns
#plotting results:
pwc.Ct.age <- pwc.Ct.age %>% add_xy_position(x = "AgeGroup")
ggboxplot(PCR_data_RV, x = "AgeGroup", y = "Ct") +
stat_pvalue_manual(pwc.Ct.age, hide.ns = TRUE) +
labs(
subtitle = get_test_label(res.age.kruskal, detailed = TRUE),
caption = get_pwc_label(pwc.Ct.age)
)
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
#analysis Ct versus symptomatic:
#summary of the dataframe:
PCR_data_RV.NA <- na.omit(PCR_data_RV)
ggboxplot(PCR_data_RV.NA, x="Symptoms", y="Ct")
ggplot(data = PCR_data_RV.NA, mapping = aes(x=Symptoms, y=Ct, fill=Symptoms)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.4, alpha=0.9)
#Kruskal test
res.sym.kruskal <- kruskal_test(PCR_data_RV.NA, Ct ~ Symptoms)
res.sym.kruskal
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 Ct 962 45.6 1 1.46e-11 Kruskal-Wallis
#calculate effect size
res.sym.effsize <- PCR_data_RV.NA %>% kruskal_effsize(Ct ~ Symptoms)
res.sym.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Ct 962 0.0464 eta2[H] small
#Wilcoxon’s test to calculate pairwise comparisons between group levels with corrections for multiple testing:
pwc.ct <- PCR_data_RV.NA %>%
wilcox_test(Ct ~ Symptoms, p.adjust.method = "bonferroni")
pwc.ct
## # A tibble: 1 × 7
## .y. group1 group2 n1 n2 statistic p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 Ct Asymptomatic Symptomatic 307 655 127664 1.46e-11
#visualization: box plots with p-values:
pwc.ct <- pwc.ct %>% add_xy_position(x = "Symptoms")
ggboxplot(PCR_data_RV.NA, x = "Symptoms", y = "Ct") +
stat_pvalue_manual(pwc.ct, hide.ns = TRUE) +
labs(
subtitle = get_test_label(res.sym.kruskal, detailed = TRUE),
caption = get_pwc_label(pwc.ct)
)
#Age versus symptoms
PCR_data_RV.NA %>% group_by(Symptoms) %>%
get_summary_stats(Age, type="mean_sd")
## # A tibble: 2 × 5
## Symptoms variable n mean sd
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 Asymptomatic Age 307 27.2 20.6
## 2 Symptomatic Age 655 25.7 18.6
ggboxplot(PCR_data_RV.NA, x="Symptoms", y="Age")
ggplot(data = PCR_data_RV.NA, mapping = aes(x=Symptoms, y=Age, fill=Symptoms)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.4, alpha=0.9)
#Kruskal test
res.ageSYM.kruskal <- kruskal_test(PCR_data_RV.NA, Age ~ Symptoms)
res.ageSYM.kruskal
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 Age 962 0.486 1 0.486 Kruskal-Wallis
#multivariate multiple regression model to study Ct, age and symptomatic
lm1 <- lm(Ct~Symptoms+Age, data=PCR_data_RV.NA)
Anova(lm1)
## Anova Table (Type II tests)
##
## Response: Ct
## Sum Sq Df F value Pr(>F)
## Symptoms 1565 1 47.2194 1.141e-11 ***
## Age 127 1 3.8183 0.05098 .
## Residuals 31780 959
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the model
avPlots(lm1)
#identify the formula:
summ(lm1, confint=TRUE, digits=3) #in this case the formula is: Ct= 27.892 + 0.018 x age - 2.715 x symptomatic (0=asymptomatic, 1=symptomatic: categorical variables are choose as baseline depending on alphabetical order)
## MODEL INFO:
## Observations: 962
## Dependent Variable: Ct
## Type: OLS linear regression
##
## MODEL FIT:
## F(2,959) = 26.049, p = 0.000
## R² = 0.052
## Adj. R² = 0.050
##
## Standard errors: OLS
## ---------------------------------------------------------------------
## Est. 2.5% 97.5% t val. p
## ------------------------- -------- -------- -------- -------- -------
## (Intercept) 27.896 27.071 28.721 66.384 0.000
## SymptomsSymptomatic -2.738 -3.520 -1.956 -6.872 0.000
## Age 0.019 -0.000 0.038 1.954 0.051
## ---------------------------------------------------------------------
#plot
interact_plot(lm1, pred=Age, modx=Symptoms, plot.points=TRUE, point.alpha=0.2, interval = TRUE, int.type = "confidence", int.width = .8)
## Warning: Age and Symptoms are not included in an interaction with one another in the
## model.
#verify normality of residuals
qqnorm(lm1$residuals)
qqline(lm1$residuals)
RV seasonality plots with the Washington State genomes:
#Loading the dataframe:
season_data_RV <- read_excel("RV-dataframe.xlsx")
season_data_RV$CollMONTH <- as.factor(substr(season_data_RV$CollDate,1,7))
season_data_RV$Genotype <- as.factor(season_data_RV$Genotype)
season_data_RV$sex <- as.factor(season_data_RV$IndSex)
season_data_RV$RVSpecie <- as.factor(season_data_RV$RVSpecie)
season_data_RV$Clinica <- as.factor(season_data_RV$Clinica)
season_data_RV$StudyPeriod <- as.factor(season_data_RV$StudyPeriod)
season_data_RV$AgeGroup <- as.factor(season_data_RV$AgeGroup)
season_data_RV$IndAge <- as.numeric(season_data_RV$IndAge)
season_data_RV$Ct <- as.numeric(season_data_RV$Ct)
season_data_RV$ZIPCODE <- as.factor(season_data_RV$ZIPCODE)
# ----->Bubble chart of seasonality of RV genotypes of all the species together. It is needed an Excel file containing a column for the sequence name and a column for the genotype:
condensed_season_data_RV <- ddply(season_data_RV,.(Genotype,CollMONTH),nrow)
ggplot(condensed_season_data_RV, aes(x= factor(CollMONTH), y=Genotype, size=V1)) + geom_point(alpha=0.5) + scale_y_discrete(limits=rev(c("A1B", "A2", "A7", "A9", "A11", "A12", "A13", "A16", "A18", "A20", "A21", "A22", "A23", "A24", "A25", "A28", "A29", "A30", "A31", "A32", "A33", "A34", "A38", "A39", "A45", "A46", "A47", "A49", "A53", "A54", "A58", "A59", "A60", "A61", "A62", "A63", "A64", "A66", "A67", "A68", "A73", "A78", "A80", "A85", "A94", "A101", "A105", "B3", "B4", "B6", "B14", "B27", "B42", "B48", "B70", "B72", "B83", "B91", "B92", "B100", "B101", "Bpat107", "C1", "C2", "C3", "C4", "C6", "C7", "C8", "C9", "C11", "C13", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C23", "C25", "C26", "C27", "C28", "C29", "C31", "C33", "C34", "C35", "C36", "C40", "C42", "C43", "C44", "C46", "C53", "C55", "C56"))) + xlab("Collection Date") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1 rows containing missing values (`geom_point()`).
# ----->Barplot of the seasonality of RV species by month of sample collection.
n <- length(unique(season_data_RV$RVSpecie))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
ggplot(season_data_RV, aes(x=CollMONTH, fill=RVSpecie)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y="Proportion", x="Collection date (month)") + scale_fill_manual(values = sample(col_vector, n))
# ----->Barplot of the seasonality of RV-A genotypes by month of sample collection.
season_data_RVA <- subset(season_data_RV, season_data_RV$RVSpecie=="A")
nA <- length(unique(season_data_RVA$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
ggplot(season_data_RVA, aes(x=CollMONTH, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y="Proportion", x="Collection date (month)") + scale_fill_manual(values = sample(col_vector, nA), breaks = c("A1B", "A2", "A7", "A9", "A11", "A12", "A13", "A16", "A18", "A20", "A21", "A22", "A23", "A24", "A25", "A28", "A29", "A30", "A31", "A32", "A33", "A34", "A38", "A39", "A45", "A46", "A47", "A49", "A53", "A54", "A58", "A59", "A60", "A61", "A62", "A63", "A64", "A66", "A67", "A68", "A73", "A78", "A80", "A85", "A94", "A101", "A105")) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ----->Barplot of the seasonality of RV-B genotypes by month of sample collection.
season_data_RVB <- subset(season_data_RV, season_data_RV$RVSpecie=="B")
nB <- length(unique(season_data_RVB$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
ggplot(season_data_RVB, aes(x=CollMONTH, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y="Proportion", x="Collection date (month)") + scale_fill_manual(values = sample(col_vector, nB), breaks = c("B3", "B4", "B6", "B14", "B27", "B42", "B48", "B70", "B72", "B83", "B91", "B92", "B100", "B101", "Bpat107")) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ----->Barplot of the seasonality of RV-C genotypes by month of sample collection.
season_data_RVC <- subset(season_data_RV, season_data_RV$RVSpecie=="C")
nC <- length(unique(season_data_RVC$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
ggplot(season_data_RVC, aes(x=CollMONTH, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y="Proportion", x="Collection date (month)") + scale_fill_manual(values = sample(col_vector, nC), breaks = c("C1", "C2", "C3", "C4", "C6", "C7", "C8", "C9", "C11", "C13", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C23", "C25", "C26", "C27", "C28", "C29", "C31", "C33", "C34", "C35", "C36", "C40", "C42", "C43", "C44", "C46", "C53", "C55", "C56")) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1))
Assocaition of RV species and genotypes with with clinical and epidemiological characteristics:
#RV species versus individuals age
ggplot(season_data_RV, aes(x=RVSpecie, y=IndAge, fill=RVSpecie)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="white", alpha=0.2) + geom_jitter(color="black", size=0.4, alpha=0.9) + theme_minimal()
## Warning: Removed 477 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 477 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 477 rows containing missing values (`geom_point()`).
#RV genotypes within RV species versus individuals age
season_data_RVA <- subset(season_data_RV, subset = RVSpecie == "A")
season_data_RVB <- subset(season_data_RV, subset = RVSpecie == "B")
season_data_RVC <- subset(season_data_RV, subset = RVSpecie == "C")
ggplot(season_data_RVA, aes(x=Genotype, y=IndAge)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 330 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 330 rows containing non-finite values (`stat_boxplot()`).
ggplot(season_data_RVB, aes(x=Genotype, y=IndAge)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 54 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 54 rows containing non-finite values (`stat_boxplot()`).
ggplot(season_data_RVC, aes(x=Genotype, y=IndAge)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 93 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 93 rows containing non-finite values (`stat_boxplot()`).
#Age Group and sex by RV species
df_clinica <- season_data_RV[!is.na(season_data_RV$IndAge),] %>% filter(IndSex!='U')
ggplot(df_clinica, aes(x=RVSpecie, fill=AgeGroup)) + geom_bar(position = "dodge")
ggplot(df_clinica, aes(x=AgeGroup, fill=RVSpecie)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))
ggplot(df_clinica, aes(x=RVSpecie, fill=IndSex)) + geom_bar(position = "dodge")
#statistical test
df_clinica_sex.spe <- as.matrix(table(df_clinica$RVSpecie, df_clinica$IndSex))
g.test(df_clinica_sex.spe)
##
## G-test of independence
##
## data: df_clinica_sex.spe
## X-squared = 2.38, p-value = 0.3042
df_clinica_age.spe <- as.matrix(table(df_clinica$RVSpecie, df_clinica$AgeGroup))
g.test(df_clinica_age.spe)
## Warning in g.test(df_clinica_age.spe): G-statistic approximation may be
## incorrect due to E < 5
##
## G-test of independence
##
## data: df_clinica_age.spe
## X-squared = 26.226, p-value = 0.000202
#Symptoms versus RV species
df_clinica.SYM <- df_clinica[!is.na(df_clinica$Clinica),]
ggplot(df_clinica.SYM, aes(x=Clinica, fill=RVSpecie)) + geom_bar(position = "dodge")
#statistical test
df_clinica_sym.spe <- as.matrix(table(df_clinica.SYM$RVSpecie, df_clinica.SYM$Clinica))
g.test(df_clinica_sym.spe)
##
## G-test of independence
##
## data: df_clinica_sym.spe
## X-squared = 0.21346, p-value = 0.8988
#Symptoms by age group per RV species
df_clinica.SYMA <- subset(df_clinica.SYM, subset = RVSpecie == "A")
df_clinica.SYMB <- subset(df_clinica.SYM, subset = RVSpecie == "B")
df_clinica.SYMC <- subset(df_clinica.SYM, subset = RVSpecie == "C")
ggplot(df_clinica.SYMA, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))
df_clinica.SYMA_sym.age <- as.matrix(table(df_clinica.SYMA$Clinica, df_clinica.SYMA$AgeGroup))
g.test(df_clinica.SYMA_sym.age)
## Warning in g.test(df_clinica.SYMA_sym.age): G-statistic approximation may be
## incorrect due to E < 5
##
## G-test of independence
##
## data: df_clinica.SYMA_sym.age
## X-squared = 1.9512, p-value = 0.5826
ggplot(df_clinica.SYMB, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))
df_clinica.SYMB_sym.age <- as.matrix(table(df_clinica.SYMB$Clinica, df_clinica.SYMB$AgeGroup))
g.test(df_clinica.SYMB_sym.age)
## Warning in g.test(df_clinica.SYMB_sym.age): G-statistic approximation may be
## incorrect due to E < 5
##
## G-test of independence
##
## data: df_clinica.SYMB_sym.age
## X-squared = 2.9685, p-value = 0.3965
ggplot(df_clinica.SYMC, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))
df_clinica.SYMC_sym.age <- as.matrix(table(df_clinica.SYMC$Clinica, df_clinica.SYMC$AgeGroup))
g.test(df_clinica.SYMC_sym.age)
## Warning in g.test(df_clinica.SYMC_sym.age): G-statistic approximation may be
## incorrect due to E < 5
##
## G-test of independence
##
## data: df_clinica.SYMC_sym.age
## X-squared = 5.3358, p-value = 0.1488
#logistic regressions to evaluate association with geographic location and individuals age:
season_data_RV %>% group_by(ZIPCODE) %>%
get_summary_stats(IndAge, type="mean_sd") %>% print(n = 25)
## # A tibble: 24 × 5
## ZIPCODE variable n mean sd
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 98001 IndAge 32 30.5 19.1
## 2 98003 IndAge 1 41 NA
## 3 98006 IndAge 1 44 NA
## 4 98007 IndAge 99 30.7 20.5
## 5 98021 IndAge 1 53 NA
## 6 98036 IndAge 21 25.9 20.5
## 7 98037 IndAge 3 32 31.0
## 8 98103 IndAge 1 52 NA
## 9 98104 IndAge 7 40.9 16.3
## 10 98105 IndAge 1 69 NA
## 11 98107 IndAge 4 13.2 9.18
## 12 98109 IndAge 4 33 25.6
## 13 98116 IndAge 31 29.3 18.7
## 14 98118 IndAge 15 28.5 21.2
## 15 98133 IndAge 106 26.2 20.3
## 16 98134 IndAge 43 29 17.5
## 17 98144 IndAge 1 24 NA
## 18 98155 IndAge 5 46.2 9.76
## 19 98195 IndAge 80 28.1 21.5
## 20 98201 IndAge 14 22 17.6
## 21 98272 IndAge 10 21.5 23.2
## 22 98908 IndAge 5 9.2 15.6
## 23 99336 IndAge 4 21.5 13.2
## 24 <NA> IndAge 37 12.3 17.4
#plotting
season_data_RV_clean <- season_data_RV[!is.na(season_data_RV$ZIPCODE),]
ggplot(season_data_RV_clean, aes(x=ZIPCODE, y=IndAge)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
#if leaving zip code with more than 5 cases reported:
season_data_RV.5 <- season_data_RV %>%
group_by(ZIPCODE) %>%
filter(n() > 5) %>%
na.omit(season_data_RV.5$ZIPCODE)
ggplot(season_data_RV.5, aes(x=ZIPCODE, y=IndAge)) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
#multivariate multiple regression model to study Ct, symptoms and RV SPECIES (using data frame of sequenced samples):
season_data_RV_clean <- season_data_RV[!is.na(season_data_RV$Clinica),]
lm_reg1 <- lm(Ct~Clinica+RVSpecie, season_data_RV_clean)
Anova(lm_reg1)
## Anova Table (Type II tests)
##
## Response: Ct
## Sum Sq Df F value Pr(>F)
## Clinica 87.4 1 5.1430 0.02381 *
## RVSpecie 38.3 2 1.1276 0.32473
## Residuals 7661.8 451
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the model
avPlots(lm_reg1)
#identify the formula:
summ(lm_reg1, confint=TRUE, digits=3) #in this case the formula is: Ct= 27.892 + 0.018 x age - 2.715 x symptomatic (0=asymptomatic, 1=symptomatic: categorical variables are choose as baseline depending on alphabetical order)
## MODEL INFO:
## Observations: 455
## Dependent Variable: Ct
## Type: OLS linear regression
##
## MODEL FIT:
## F(3,451) = 2.417, p = 0.066
## R² = 0.016
## Adj. R² = 0.009
##
## Standard errors: OLS
## --------------------------------------------------------------------
## Est. 2.5% 97.5% t val. p
## ------------------------ -------- -------- -------- -------- -------
## (Intercept) 23.525 22.674 24.376 54.344 0.000
## ClinicaSymptomatic -1.027 -1.917 -0.137 -2.268 0.024
## RVSpecieB 0.894 -0.318 2.105 1.450 0.148
## RVSpecieC 0.326 -0.500 1.152 0.775 0.439
## --------------------------------------------------------------------
#verify normality of residuals
qqnorm(lm_reg1$residuals)
qqline(lm_reg1$residuals)
#Ct value and the genotypes (with more than 5 cases detected):
season_data_RV_CT.5 <- season_data_RV %>% group_by(Genotype) %>% filter(n() > 5) %>% filter(Ct!="NA")
season_data_RV_CT.5$Genotype<- as.factor(season_data_RV_CT.5$Genotype)
#Kruskal test
res.gen.ct.kruskal <- kruskal_test(season_data_RV_CT.5, season_data_RV_CT.5$Ct ~ season_data_RV_CT.5$Genotype)
res.gen.ct.kruskal
## # A tibble: 36 × 7
## Genotype .y. n statistic df p method
## * <fct> <chr> <int> <dbl> <int> <dbl> <chr>
## 1 A101 season_data_RV_CT.5$Ct 35 84.6 35 0.00000538 Kruskal-Wal…
## 2 A11 season_data_RV_CT.5$Ct 4 84.6 35 0.00000538 Kruskal-Wal…
## 3 A1B season_data_RV_CT.5$Ct 79 84.6 35 0.00000538 Kruskal-Wal…
## 4 A2 season_data_RV_CT.5$Ct 4 84.6 35 0.00000538 Kruskal-Wal…
## 5 A20 season_data_RV_CT.5$Ct 7 84.6 35 0.00000538 Kruskal-Wal…
## 6 A22 season_data_RV_CT.5$Ct 6 84.6 35 0.00000538 Kruskal-Wal…
## 7 A23 season_data_RV_CT.5$Ct 5 84.6 35 0.00000538 Kruskal-Wal…
## 8 A24 season_data_RV_CT.5$Ct 5 84.6 35 0.00000538 Kruskal-Wal…
## 9 A25 season_data_RV_CT.5$Ct 15 84.6 35 0.00000538 Kruskal-Wal…
## 10 A28 season_data_RV_CT.5$Ct 2 84.6 35 0.00000538 Kruskal-Wal…
## # ℹ 26 more rows
#calculate effect size
res.gen.ct.effsize <- season_data_RV %>% kruskal_effsize(Ct ~ Genotype)
res.gen.ct.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Ct 1003 0.0869 eta2[H] moderate
ggboxplot(season_data_RV_CT.5, x = "Genotype", y = "Ct") + theme(axis.text.x = element_text(angle = 45, hjust = 1))